%matplotlib inline
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized; plays well with numpy and matplotlib
import matplotlib
import matplotlib.pyplot as plt #plot visualization
#bootstrapping a confidence interval on the difference in means:
iters = 1000 #iterations
N1 = 500 #observations
N2 = 100 #observations
#pretend this data was real:
X1 = pd.Series(np.random.normal(100, 1., N1))
X2 = pd.Series(np.random.normal(99, 0.5, N2))
X1.hist(bins = N1//10, alpha=.7)
X2.hist(bins = N2//10, alpha=.5)
#plt.show()
#compute original difference
origDiff = X1.mean() - X2.mean()
print "original diff:", origDiff
#sys.exit(1)
reDiffs = list()
for i in range(iters):
    #draw a random resampling with replacement (drawing from the hat):
    reX1indices = np.random.randint(0, N1, N1) #randint avoids the off-by-one of int(uniform(0, N-1))
    reX2indices = np.random.randint(0, N2, N2)
    #print(reX1indices)
    #sys.exit(1)
    reX1 = X1[reX1indices] #resampled X1
    reX2 = X2[reX2indices] #resampled X2
    #reX1.hist(bins = N1//10, alpha=.3)
    reDiff = reX1.mean() - reX2.mean()
    #print("resampled diff", reDiff)
    reDiffs.append(reDiff)
plt.show()
sortedReDiffs = pd.Series(sorted(reDiffs))
#print sortedReDiffs.head()
sortedReDiffs.hist(bins = iters//6)
print("histogram of bootstrapped diffs")
sRDdesc = sortedReDiffs.describe(percentiles=[.025, .975])
plt.show()
print "The difference in means is %.4f with 95%% CI: [%.4f, %.4f]" % (origDiff,sRDdesc['2.5%'], sRDdesc['97.5%'])
#Linear regression with statsmodels OLS (m is the number of features)
import statsmodels.api as sm
#load the iris data:
iris = pd.read_csv('iris.csv')
print(iris.head())
print(iris.describe())
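#If iris.csv isn't on hand, an equivalent frame can be built from sklearn's
#bundled copy (a sketch; the column names are chosen to match this notebook):
#from sklearn.datasets import load_iris
#data = load_iris()
#iris = pd.DataFrame(data.data, columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'])
#iris['Name'] = [data.target_names[t] for t in data.target]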
#example: predicting Sepal Length from the other iris characteristics:
iris_train = iris[:130] #training data
iris_test = iris[130:150] #testing data
model = sm.OLS(iris_train['SepalLength'], iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']]).fit()
y_hat = np.dot(iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']], model.params) #model.params are betas...
#y true, y predicted, error
print(np.array([iris_test['SepalLength'], y_hat, iris_test['SepalLength'] - y_hat]).T)
print("mean squared error: %.3f" % np.mean((iris_test['SepalLength'] - y_hat)**2))
#an example of overfitting by using too many features compared to training observations:
#Let's setup a train and test (no development because we aren't setting any extra parameters)
iris_train = iris[:10] #training data
iris_test = iris[100:] #testing data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_2feats = iris_train[['PetalWidth','SepalWidth']] #uppercase => matrix
#Train (fit) the models:
lr_result_3f = sm.OLS(y_train, X_3feats).fit()
lr_result_2f = sm.OLS(y_train, X_2feats).fit()
betas_3f = lr_result_3f.params
print "\nbetas for 3 features:\n", betas_3f
betas_2f = lr_result_2f.params
print "\nbetas for 2 features:\n", betas_2f
#setup test
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_2f = iris_test[['PetalWidth','SepalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']
#apply model to test and check error:
y_hat_test_3f = np.dot(X_test_3f, betas_3f)
y_hat_train_3f = np.dot(X_3feats, betas_3f)
y_hat_test_2f = np.dot(X_test_2f, betas_2f)
error_3f = np.mean((y_test - y_hat_test_3f)**2)
error_3ftrain = np.mean((y_train - y_hat_train_3f)**2)
error_2f = np.mean((y_test - y_hat_test_2f)**2)
print "\nMSerror when using 3 feats(test): %.2f" % error_3f
print "MSerror when using 3 feats(train): %.2f" % error_3ftrain
print "MSerror when using 2 feats: %.2f" % error_2f
#Regularization Comparison
from sklearn.linear_model import Ridge, Lasso
#RIDGE:
weights = []
mses = []
#alphas = [.001, .01, .1, 1, 10]
alphas = [(2**i)/10000.0 for i in range(20)]
print("standard deviation of y_hats as alpha increases:")
for alpha in alphas:
    model = Ridge(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    #print(model.coef_)
    #figure out error:
    y_hat = model.predict(X_test_3f)
    print(y_hat.std())
    mses.append(np.mean((y_test - y_hat)**2)) #mean, not sum, so these really are MSEs
plt.plot([[i]*len(weights[0]) for i in range(len(alphas))], weights)
plt.xticks([i for i in range(len(alphas))], alphas, rotation='vertical')
plt.show()
plt.plot([i for i in range(len(alphas))], mses)
plt.xticks([i for i in range(len(alphas))], alphas, rotation='vertical')
plt.show()
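#Aside (a sketch, not the loop above): with fit_intercept=False, ridge has the
#closed form beta = (X'X + alpha*I)^-1 X'y; a quick check against sklearn:
a = 1.0 #an arbitrary alpha for the comparison
XtX = np.dot(X_3feats.T, X_3feats)
beta_closed = np.linalg.solve(XtX + a*np.eye(XtX.shape[0]), np.dot(X_3feats.T, y_train))
print("closed-form ridge betas:", beta_closed)
print("sklearn ridge betas:", Ridge(alpha=a, fit_intercept=False).fit(X_3feats, y_train).coef_)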
#LASSO:
weights = []
mses = []
alphas = alphas[:13]
for alpha in alphas:
    model = Lasso(alpha=alpha)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_)
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2))
plt.plot([[i]*len(weights[0]) for i in range(len(alphas))], weights)
plt.xticks([i for i in range(len(alphas))], alphas, rotation='vertical')
plt.show()
plt.plot([i for i in range(len(alphas))], mses)
plt.xticks([i for i in range(len(alphas))], alphas, rotation='vertical')
plt.show()
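#The hallmark of L1: a large enough alpha drives some betas exactly to zero.
#A quick sketch inspecting the final fit above (the list may be empty if no
#coefficient was zeroed at these alphas):
print("weights at alpha=%.4f:" % alphas[-1], weights[-1])
print("zeroed features:", [f for f, w in zip(X_3feats.columns, weights[-1]) if w == 0.0])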
#re-setup data
y_train = iris_train['SepalLength'] #from now on, lowercase => vector
X_3feats = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
X_test_3f = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth']] #uppercase => matrix
y_test = iris_test['SepalLength']
#turn y into a classification problem:
split_point = y_train.mean()
y_train = [1 if i else 0 for i in y_train > split_point]
y_test = [1 if i else 0 for i in y_test > split_point]
print(y_train)
plt.scatter(X_3feats['SepalWidth'], y_train)
plt.xlabel('SepalWidth')
plt.ylabel('Is Long?')
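plt.show()
#Logistic regression squashes the linear score X*beta through the sigmoid
#1/(1+e^-z) to get P(y=1); a small sketch of that curve for reference:
z = np.linspace(-6, 6, 100)
plt.plot(z, 1.0/(1.0 + np.exp(-z)))
plt.xlabel('z = x*beta')
plt.ylabel('P(y=1)')
plt.show()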
from sklearn.linear_model import LogisticRegression
#L2:
weights = []
mses = []
#alphas = np.array([.1, 1, 10, 100, 1000, 10000, 100000])
alphas = [.1*3**i for i in range(14)]
Cs = alphas[::-1] #reverse
for c in Cs:
    model = LogisticRegression(penalty='l2', C=c)
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print(model.coef_)
    #figure out error:
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in range(len(Cs))], weights)
plt.xticks([i for i in range(len(Cs))], Cs, rotation='vertical')
plt.show()
#plt.plot([i for i in range(len(Cs))], mses)
#plt.xticks([i for i in range(len(Cs))], Cs, rotation='vertical')
#plt.show()
#L1
weights = []
mses = []
Cs = Cs[3:-2]
for c in Cs:
    model = LogisticRegression(penalty='l1', solver='liblinear', C=c) #liblinear supports the l1 penalty
    model.fit(X_3feats, y_train)
    weights.append(model.coef_[0])
    #print(model.coef_)
    #figure out error:
    y_hat = model.predict(X_test_3f)
    mses.append(np.mean((y_test - y_hat)**2))
#print weights
plt.plot([[i]*len(weights[0]) for i in range(len(Cs))], weights)
plt.xticks([i for i in range(len(Cs))], Cs, rotation='vertical')
plt.show()
#setup data:
X = iris[['PetalWidth','SepalWidth']] #'SepalLength', 'PetalLength'
X.plot(kind='scatter', x = 'PetalWidth', y ='SepalWidth', alpha=.3)
plt.show()
#run kmeans using sklearn:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=2)
model.fit(X) #run KMeans
clusters = model.predict(X)
print "Clusters assigned", clusters
#plot the results
plt.scatter(X['PetalWidth'], X['SepalWidth'], c = clusters, alpha=.4) #color-coded points
centers = model.cluster_centers_
#mark the centers
plt.scatter(centers[:, 0], centers[:, 1], marker='x', s=200, linewidths=5, color='teal', alpha=.6)
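plt.show()
#What fit() loops over internally, sketched by hand for one round (the assign
#step, then the center-update step); just the idea, not sklearn's actual code:
pts = X.values
cts = pts[np.random.choice(len(pts), 2, replace=False)] #random initial centers
dists = np.array([[np.linalg.norm(p - c) for c in cts] for p in pts])
assign = dists.argmin(axis=1) #assign each point to its nearest center
cts = np.array([pts[assign == k].mean(axis=0) for k in range(2)]) #recompute centers (assumes neither cluster is empty)
print("centers after one manual iteration:\n", cts)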
#setup data (could use same as above but changing it up for variety)
X = iris[['SepalLength', 'PetalWidth']]
#X = iris[['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']]
X = (X - X.mean()) #mean center the data
X.plot(kind='scatter', x ='SepalLength', y = 'PetalWidth', alpha=.4)
#sys.exit(1)
#Run PCA, using sklearn:
from sklearn.decomposition import PCA
model = PCA(n_components = 2)
model.fit(X)
#let's look at the components (directions)
V = model.components_ #components = directions
print "v martrix:\n", V
plt.scatter([V[0][0]], [V[0][1]], c = [9])
#sys.exit(1)
#draw each direction:
for c in V:
    slope = c[1]/c[0]
    xpoints = np.linspace(X.iloc[:, 0].min(), X.iloc[:, 0].max(), 10)
    UDc = [xp*slope for xp in xpoints] #the principal component line for the given direction
    plt.plot(xpoints, UDc)
#plt.ylim([-3,3])
#plt.xlim([-3,3])
plt.show()
#sys.exit(1)
#let's apply the transform based on v:
X_lowdim = model.transform(X)
X_lowdim2 = np.dot(X,V.T) #same as above
print(V)
print(X.head())
print(X_lowdim[:5])
print(X_lowdim2[:5])
Xld = pd.Series(X_lowdim[:,0])
Xld.hist()
plt.scatter(Xld, [-1]*len(Xld), alpha = 0.3)
plt.show()
#sys.exit(1)
#explained variance
print "percentage variance explained:" , model.explained_variance_ratio_
var = X_lowdim.std(axis=0)**2
print "variance per component ", var
print "our percentage variance explained", [vi / sum(var) for vi in var]
#setup class names to be flower names
print(iris[:3])
names = list(set(iris['Name'].tolist()))
nameToIndex = dict([(names[i],i) for i in range(len(names))])
print(nameToIndex)
#setup X and y:
iris = iris.iloc[np.random.permutation(len(iris))]
iris_train = iris[:100] #training data
iris_test = iris[100:] #testing data
Xtrain = iris_train[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtrain = iris_train[['SepalWidth']]
Xtest = iris_test[['SepalWidth', 'PetalLength', 'PetalWidth','SepalLength']]
#Xtest = iris_test[['SepalWidth']]
ytrain = np.array([nameToIndex[n] for n in iris_train['Name'].tolist()])
ytest = np.array([nameToIndex[n] for n in iris_test['Name'].tolist()])
Xtrain[['SepalWidth']].hist()
plt.show()
#setup
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
print "Class priors: ", clf.class_prior_
print "\nClass means: \n", clf.theta_
print "\nClass variance: \n", clf.sigma_
#lets just look at one test example:
oneExample = Xtest.iloc[6]
print(oneExample)
print("\npredicted class: ", clf.predict([oneExample]))
print("\nprobabilities for each class:", ["%.6f" % p for p in clf.predict_proba([oneExample])[0]])
#check accuracy for all predictions
y_hat = clf.predict(Xtest)
accuracy = np.mean(y_hat == ytest) #fraction of test examples predicted correctly
print("accuracy:", accuracy)
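#Where predict_proba comes from, sketched by hand for oneExample: per class,
#add the log prior to the summed per-feature Gaussian log densities, then
#normalize (done in log space for stability):
logp = np.log(clf.class_prior_) + ss.norm(clf.theta_, np.sqrt(clf.var_)).logpdf(np.asarray(oneExample)).sum(axis=1)
probs = np.exp(logp - logp.max())
print("manual posteriors:", probs/probs.sum())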
#Let's make up some time series data:
#each index is a day; a day is 75% likely to continue in the same direction as the trend over the last 3 days
def MakeFakeTimeSeries(mod = None):
    #start with a random 3 days.
    x = ss.norm(10, 1).rvs(size=3)
    #print(x)
    for i in range(3, 30):
        #current trend direction:
        currentTrendDir = 1 if x[i-1] - x[i-3] > 0 else -1
        #find a magnitude of change:
        magnitudeOfChange = ss.norm(1, .7).rvs(size=1)[0]
        #75% chance it continues in the same direction:
        changeDirection = 1 if ss.bernoulli(.75).rvs(size=1)[0] > 0 else -1
        delta = magnitudeOfChange * (currentTrendDir * changeDirection)
        #print((currentTrendDir, magnitudeOfChange, changeDirection, delta))
        x = np.append(x, x[i-1] + delta)
    #print(x)
    return pd.Series(x)
x1 = MakeFakeTimeSeries()
x2 = MakeFakeTimeSeries()
x1.plot(kind='line')
x2.plot(kind='line')
print(ss.pearsonr(x1, x2))
#Run this multiple times to see how often time-series-like data correlate with themselves
def corrWithSelf(y, lag = 1):
    return ss.pearsonr(y[:-1*lag], y[lag:])
fakeTS = x1
print("fake TS autocorrelation with lag = 1: ", corrWithSelf(fakeTS))
#white noise:
wnoise = pd.Series(ss.norm(0, 1).rvs(size=100))
wnoise.plot(kind='line')
plt.show()
print "With noise autocorrelation with lag = 1: ", corrWithSelf(wnoise, 1)
#periodic data (sinusoidal):
x = np.linspace(0, 50, 100)
ssdal = pd.Series(np.sin(x) + np.random.random(100))
ssdal.plot(kind='line', lw=3, alpha=.5)
print "sinusoidal autocorrelation with lag = 1: ", corrWithSelf(ssdal)
print "sinusoidal autocorrelation with lag = 4: ", corrWithSelf(ssdal, 4)
print "sinusoidal autocorrelation with lag = 7: ", corrWithSelf(ssdal, 7)
print "sinusoidal autocorrelation with lag = 10: ", corrWithSelf(ssdal, 10)
print "sinusoidal autocorrelation with lag = 13: ", corrWithSelf(ssdal, 13)
#plot the lag=13 version:
pd.Series(ssdal[13:].tolist()).plot(kind='line', lw=3, alpha=.6)
plt.show()
fcs = {'2003-12': 0.1991, '2003-10': 0.2686, '2003-11': 0.1956, '2014-09': 1.9471, '2014-08': 1.9398, '2014-05': 1.6808, '2014-04': 1.7328, '2014-07': 1.983, '2014-06': 1.8564, '2001-10': 0.25, '2001-11': 0.2393, '2001-12': 0.1948, '2014-02': 1.6022, '2010-01': 2.1771, '2010-03': 2.3553, '2010-02': 2.1249, '2010-05': 2.1108, '2010-04': 2.3586, '2002-08': 0.529, '2002-09': 0.5384, '2002-06': 0.3795, '2002-07': 0.4673, '2002-04': 0.2561, '2002-05': 0.3084, '2002-02': 0.203, '2002-03': 0.2409, '2013-02': 1.0462, '2002-01': 0.2237, '2000-01': 0.4537, '2000-02': 0.4531, '2000-03': 0.4669, '2000-04': 0.4489, '2000-05': 0.4922, '2000-06': 0.4004, '2000-07': 0.3288, '2000-08': 0.2904, '2000-09': 0.3035, '2012-07': 0.7055, '2004-12': 0.684, '2004-11': 0.6087, '2004-10': 0.5991, '2012-04': 0.8279, '2015-06': 2.5328, '2015-07': 2.4714, '2015-04': 2.7388, '2015-05': 2.5827, '2015-02': 2.3967, '2015-03': 2.6437, '2015-01': 2.1805, '2014-12': 2.2094, '2014-01': 1.5996, '2014-10': 2.1816, '2014-11': 2.163, '2011-11': 0.8462, '2011-10': 0.7601, '2015-08': 2.6307, '2011-12': 0.9108, '2012-09': 0.9374, '2012-08': 0.7143, '2014-03': 1.6035, '2012-03': 0.8842, '2012-02': 0.8243, '2012-01': 0.8573, '2010-12': 1.2324, '2012-06': 0.7444, '2010-10': 2.1948, '2010-11': 1.6971, '2012-05': 0.7355, '2004-04': 0.7475, '2004-05': 0.71, '2004-06': 0.7159, '2004-07': 0.7272, '2004-01': 0.4498, '2004-02': 0.6706, '2004-03': 0.7329, '2013-08': 1.2974, '2004-08': 0.6687, '2004-09': 0.5591, '2013-09': 1.3191, '2011-08': 0.722, '2011-09': 0.7176, '2010-07': 1.8553, '2011-02': 0.7774, '2011-03': 0.7912, '2010-06': 1.9288, '2011-06': 0.6414, '2011-07': 0.6254, '2011-04': 0.7136, '2011-05': 0.6154, '2010-09': 2.2013, '2007-02': 0.8849, '2010-08': 2.1603, '2012-10': 0.9287, '2012-11': 0.8851, '2012-12': 0.9112, '2013-06': 1.0281, '2007-09': 1.2881, '2007-08': 1.3326, '2013-07': 1.1001, '2006-11': 0.7659, '2001-09': 0.2584, '2006-12': 0.7898, '2007-01': 0.8339, '2007-03': 0.9444, '2001-08': 0.2581, '2007-05': 1.1781, '2007-04': 1.056, '2007-07': 1.294, '2007-06': 1.2395, '2005-03': 0.7924, '2005-02': 0.7441, '2005-01': 0.7297, '2005-07': 0.727, '2005-06': 0.7982, '2005-05': 0.7518, '2005-04': 0.7811, '2005-09': 0.6982, '2005-08': 0.7234, '2003-09': 0.3893, '2003-08': 0.5029, '2007-12': 1.5599, '2007-10': 1.5662, '2007-11': 1.599, '2006-02': 0.8533, '2006-03': 0.9117, '2006-01': 0.9119, '2006-06': 0.8982, '2006-07': 0.9514, '2006-04': 0.869, '2006-05': 0.8514, '2008-12': 1.8907, '2006-08': 0.9158, '2006-09': 0.8749, '2005-10': 0.7011, '2005-11': 0.6953, '2005-12': 0.7685, '2008-08': 1.8377, '2008-09': 1.911, '2013-04': 0.9641, '2008-01': 1.6453, '2008-02': 1.7691, '2008-03': 1.945, '2008-04': 1.9918, '2008-05': 1.9785, '2008-06': 1.9643, '2008-07': 1.8963, '2013-01': 0.9678, '2013-10': 1.3612, '2013-03': 1.0694, '2011-01': 0.9035, '2015-11': 2.9898, '1998-09': 0.3616, '1998-08': 0.3926, '1998-05': 0.3818, '1998-04': 0.3728, '1998-07': 0.431, '1998-06': 0.4074, '1998-01': 0.4211, '1998-03': 0.4276, '1998-02': 0.4382, '2015-10': 2.8921, '1999-11': 0.3886, '1999-10': 0.4095, '1999-12': 0.4362, '2008-11': 1.9555, '2008-10': 2.1204, '1999-06': 0.4582, '1999-07': 0.4643, '1999-04': 0.488, '1999-05': 0.4591, '1999-02': 0.3732, '1999-03': 0.4864, '1999-01': 0.3894, '1998-12': 0.4039, '1998-10': 0.3764, '1998-11': 0.3995, '1999-08': 0.456, '1999-09': 0.4113, '2015-12': 3.2485, '2009-07': 2.1571, '2009-06': 2.0605, '2009-05': 2.0474, '2009-04': 2.1199, '2009-03': 2.1778, '2009-02': 1.8791, '2009-01': 1.8403, '2009-09': 2.1493, '2009-08': 
2.2106, '2003-05': 0.4944, '2003-04': 0.5394, '2003-07': 0.4775, '2003-06': 0.4736, '2003-01': 0.5452, '2003-03': 0.5142, '2003-02': 0.4805, '2001-07': 0.2446, '2001-06': 0.2633, '2001-05': 0.3091, '2001-04': 0.3604, '2001-03': 0.3872, '2001-02': 0.3901, '2001-01': 0.3247, '2002-11': 0.5827, '2002-10': 0.5392, '2002-12': 0.6221, '2015-09': 2.6822, '2013-12': 1.4755, '2013-11': 1.412, '2006-10': 0.8813, '2000-12': 0.3148, '2000-11': 0.269, '2000-10': 0.3237, '2013-05': 1.0311, '2009-10': 2.0288, '2009-11': 2.0994, '2009-12': 2.0438}
fcs = pd.DataFrame(list(fcs.items())).sort_values(0)
fcs = fcs.set_index(0)
fcs.plot(kind="line")
plt.xlabel("year-month")
print("NY Foreclosures autocorrelation with lag = 1: ", corrWithSelf(fcs.loc[:, 1]))
def autocorrelate_plot(y):
    series = [corrWithSelf(y, lag = lag)[0] for lag in range(1, len(y)-15)]
    return pd.Series(series)
ac_fcs = autocorrelate_plot(fcs.loc[:,1])
ac_fcs.plot(kind="line")
plt.xlabel("lag")
##more autocorrelation plots:
ac_fts = autocorrelate_plot(fakeTS)
ac_fts.plot(kind="line")
plt.xlabel("lag")
plt.show()
ac_wn = autocorrelate_plot(wnoise)
ac_wn.plot(kind="line")
plt.xlabel("lag")
plt.show()
ac_ssdal = autocorrelate_plot(ssdal)
ac_ssdal.plot(kind="line")
plt.xlabel("lag")
plt.show()
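#pandas also ships a ready-made version of this plot (with confidence bands),
#a one-line alternative to autocorrelate_plot above:
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(ssdal)
plt.show()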